---
title: "Language Pilot Results"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
```

```{r}
library(estimatr)
library(lmtest)
library(Matching)
library(MatchIt)
library(sandwich)
library(tidyverse)
source("analysis_utils.R")
```

```{r}
# This code references case and pilot data from Santa Clara County's
# contact tracing system.
cases_all <- read_csv("../data/cases_all.csv")
pilot <- read_csv("/share/pi/deho/lang_pilot_updated.csv") %>%
  transmute(
    ID = `ID/Record #`,
    `CICT Staff Name`,
    completed_interview = as.numeric(toupper(`Interview Completed?`) == "YES"),
    refuse_interview = as.numeric(
      `Interview Completed?` %in% c("Refused", "Refused to interview")
    ),
    marked_spanish_speaker = as.numeric(!is.na(`Is patient Spanish speaker?`)),
    is_spanish_speaker = as.numeric(
      (
        `Is patient Spanish speaker?` %in% c(
          "yes",
          "Yes",
          "yes but not fluent"
        )
      ) &
      (
        `Was Spanish used during interview?` %in% c(
          "Yes",
          "Started in English, completed in Spanish",
          "Yes with Interpreter"
        )
      ) |
      (
        `If no, what language was interview in?` == "Spanish"
      )
    ),
    interp_used = as.numeric(
      (
        `Was Spanish used during interview?` == "Yes with Interpreter"
      ) |
      (
        `If no, what language was interview in?` == "Through Interpreter"
      ) |
      str_detect(
        tolower(Notes),
        "translator|translater|interpret|language line"
      )
    )
  )
pilot[33, ]$ID <- 614867 # Manually adding in missing CalConnect ID.
cases_all <- cases_all %>%
  left_join(pilot, by = "ID")
```

```{r}
# import cumulative CICT file and define outcome variables of interest
# most recent CICT file with all CalConnect fields of interest:
# cict_all_031721_0802.csv
df_cict <- read_csv("/share/pi/deho/2021-03-17/cict_all_031721_0802.csv") %>%
              dplyr::select(Record.Number, Record.Owner, Status,
                     Opened.Date, Date.Time.Opened,
                     Closed.Date, Date.Time.Closed, Closed.Reason,
                     Closed.Reason..If.Other..Please.Specify,
                     Initial.Interview.Outcome, Interview.Completed.Date,
                     Reason.for.inability.to.speak.now,
                     Reason.for.inability.to.speak.now..Other,
                     DO.YOU.HAVE.CLOSE.CONTACTS.,
                     If.yes..total.number.of.close.contacts.,
                     Case.unwilling.unable.to.name.contacts.,
                     Notes.for.CalREDIE.PUI.tab..,
                     Person.Account..Gender..,
                     Person.Account..Language..) %>%
              rename(
                Notes = Notes.for.CalREDIE.PUI.tab..,
              ) %>%
              mutate(
                ID = as.numeric(`Record.Number`),
                interview_outcome =
                  case_when(
                    Initial.Interview.Outcome == "Completed" &
                      Closed.Reason == "Declined" ~ "Refused to interview",
                    Initial.Interview.Outcome == "Completed" &
                      Closed.Reason == "Was never reached" ~
                      "Lost to Follow-Up",
                    Initial.Interview.Outcome == "Completed" ~ "Completed",
                    Initial.Interview.Outcome == "Couldn't be reached" ~
                      "Lost to Follow-Up",
                    Initial.Interview.Outcome == "No attempt made" ~
                      "Lost to Follow-Up",
                    Initial.Interview.Outcome == "None" ~ "Awaiting Interview",
                    Initial.Interview.Outcome == "" & Status == "Closed" ~
                      "Lost to Follow-Up",
                    Initial.Interview.Outcome == "Refused to interview" ~
                      "Refused to interview",
                    Initial.Interview.Outcome == "Partially completed" ~
                      "Partially completed"
                  ),
                male = ifelse(Person.Account..Gender.. == "Male", 1, 0),
                completed_interview =
                  ifelse(interview_outcome == "Completed", 1, 0),
                refuse_interview =
                  ifelse(interview_outcome == "Refused to interview", 1, 0),
                refuse_provide_contacts = ifelse(
                  DO.YOU.HAVE.CLOSE.CONTACTS. %in%  c("Decline to answer"),
                  1,
                  0
                ),
                have_close_contacts =
                  ifelse(DO.YOU.HAVE.CLOSE.CONTACTS. == "Yes", 1, 0),
                Notes = str_to_lower(Notes),
                interp_used = ifelse(
                  str_detect(
                    Notes,
                    "translator|translater|interpret|language line"
                  ),
                  1,
                  0
                ),
                datetime_open = Date.Time.Opened %>%
                  gsub(",", "", .) %>%
                  as.POSIXct(., format = "%m/%d/%Y %I:%M %p"),
                datetime_comp = Interview.Completed.Date %>%
                  gsub(",", "", .) %>%
                  as.POSIXct(., format = "%m/%d/%Y %I:%M %p"),
                datetime_close = Date.Time.Closed %>%
                  gsub(",", "", .) %>%
                  as.POSIXct(., format = "%m/%d/%Y %I:%M %p"),
                time_initial =
                  difftime(datetime_comp, datetime_open, units = "days") %>%
                  as.numeric(),
                time_elapsed =
                  difftime(datetime_close, datetime_open, units = "days") %>%
                  as.numeric(),
                # define alternative time-related variables
                initial_one_day = ifelse(time_initial <= 1, 1, 0),
                initial_same_day =
                  ifelse(as.Date(datetime_open) == as.Date(datetime_comp), 1, 0)
              ) %>%
              dplyr::select(-Record.Number) %>%
              distinct(ID, .keep_all = TRUE)
```

```{r}
# list of language specialty team CICT confirmed by SCC PHD (email),
# Mar. 2, 2021
specialty_team <- readRDS("../data/specialty_team.rds")
primary <- readRDS("../data/primary.rds")

secondary <- setdiff(specialty_team, primary)

# merge CICT data to list of assigned + holdout cases by CalConnect ID
cases <-
  left_join(cases_all, df_cict, by = "ID") %>%
  # Exclude all cases with secondary tracers.
  filter(
    !(`CICT Staff Name` %in% secondary | `Record.Owner` %in% secondary)
  ) %>%
  mutate(
    # We prefer the internal spreadsheet and
    # fall back to CalConnect for these outcomes.
    completed_interview =
      ifelse(
        is.na(completed_interview.x) | completed_interview.x == 0,
        completed_interview.y,
        completed_interview.x
      ),
    refuse_interview =
      ifelse(
        is.na(refuse_interview.x) | refuse_interview.x == 0,
        refuse_interview.y,
        refuse_interview.x
      ),
    interp_used =
      ifelse(
        is.na(interp_used.x) | interp_used.x == 0,
        interp_used.y,
        interp_used.x
      ),
    # define dummy variable to indicate assigned vs. holdout
    Z = ifelse(group == "assigned", 1, 0),
    # define variable to indicate Spanish-speaking CICT
    D = ifelse(
      `CICT Staff Name` %in% specialty_team |
        `Record.Owner` %in% specialty_team,
      1,
      0
    ),
    # define unique batch-date identifier
    date_batch = paste0(Date, "_", time)
  ) %>%
  dplyr::select(
    -completed_interview.x,
    -completed_interview.y,
    -refuse_interview.x,
    -refuse_interview.y,
    -interp_used.x,
    -interp_used.y
  )

# Manual correction for cases that had both `completed_interview` and
# `refuse_interview` equal to 1.
cases[917, ]$completed_interview <- 0
cases[977, ]$refuse_interview <- 0

# Uncomment the following lines to define cases for the paper's sensitivity
# check, in which secondary LST members are included in the treatment
# definition.
# cases <-
#   left_join(cases_all, df_cict, by = "ID") %>%
#   mutate(
#     # we prefer the internal spreadsheet and fall back to CalConnect
#     # for these outcomes
#     completed_interview = ifelse(
#       is.na(completed_interview.x) | completed_interview.x == 0,
#       completed_interview.y,
#       completed_interview.x
#     ),
#     refuse_interview = ifelse(
#       is.na(refuse_interview.x) | refuse_interview.x == 0,
#       refuse_interview.y,
#       refuse_interview.x
#     ),
#     interp_used = ifelse(
#       is.na(interp_used.x) | interp_used.x == 0,
#       interp_used.y,
#       interp_used.x
#     ),
#     # define dummy variable to indicate assigned vs. holdout
#     Z = ifelse(group == "assigned", 1, 0),
#     # define variable to indicate Spanish-speaking CICT
#     D = ifelse(
#       `CICT Staff Name` %in% specialty_team |
#         `Record.Owner` %in% specialty_team,
#       1,
#       0
#     ),
#     # define unique batch-date identifier
#     date_batch = paste0(Date, "_", time)
#   ) %>%
#   dplyr::select(
#     -completed_interview.x,
#     -completed_interview.y,
#     -refuse_interview.x,
#     -refuse_interview.y,
#     -interp_used.x,
#     -interp_used.y
#   )
#
# # Manual correction for cases that had both `completed_interview` and
# # `refuse_interview` equal to 1.
# cases[764,]$completed_interview <- 0
# cases[1015,]$completed_interview <- 0
# cases[1077,]$refuse_interview <- 0
```

Get count of contacts per case.
```{r}
parent_record_number <- read_csv("/share/pi/deho/parent_record_number.csv")
NumberContactsElicited <-
  parent_record_number %>%
  group_by(Parent.Record.Number) %>%
  count() %>%
  filter(Parent.Record.Number != 0) %>%
  rename(
    Record.Number = Parent.Record.Number,
    NumberContactsElicited = n)
parent_record_number <-
  parent_record_number %>%
  left_join(NumberContactsElicited) %>%
  mutate(
    NumberContactsElicited =
      ifelse(
        is.na(NumberContactsElicited),
        0,
        NumberContactsElicited
      ),
    Elicit_atleast1_contact =
      ifelse(
        NumberContactsElicited > 0,
        1,
        0
      )
  ) %>%
  mutate(ID = as.numeric(Record.Number)) %>%
  dplyr::select(
    ID,
    NumberContactsElicited,
    Elicit_atleast1_contact
  )
cases <- cases %>%
  left_join(parent_record_number, by = "ID")
```

Add in CBG-level balance variables.
```{r}
cases <- cases %>%
              rename(CBG = GEOID)

svi <- read_csv("../data/scc_cbg_svi.csv")
hh <- read_rds("../data/scc_cbg_hh_size.rds") %>%
  mutate(CBG = as.numeric(CBG))

cases <- cases %>%
  left_join(svi, by = "CBG") %>%
  left_join(hh, by = "CBG")
```

```{r}
# save baseline dataset for evaluation purposes
write_csv(cases, "../data/language_matching_eval_data.csv")
# write_csv(cases, "../data/language_matching_eval_data_sensitivity.csv")
```


Outcome Evaluation

```{r}
# import data
# sensitivity analysis: language_matching_eval_data_sensitivity.csv
cases <- read_csv("../data/language_matching_eval_data.csv") # n = 3106, 3258

# Join with SVI attributes at the CBG level.
svi <- read_csv("/share/pi/deho/Language_Matching_Pilot/scc_cbg_svi.csv")
hh <- read_rds("/share/pi/deho/Language_Matching_Pilot/scc_cbg_hh_size.rds") %>%
  mutate(CBG = as.numeric(CBG))

cbg_covariates <- unique(append(names(svi), names(hh)))[-1]

balance_variables <- append(
  c(
    "perc_spanish",
    "male"
  ),
  cbg_covariates
)

outcome_variables <- c(
  "interp_used",
  "completed_interview",
  "refuse_interview",
  "refuse_provide_contacts",
  "time_initial",
  "time_elapsed",
  "initial_one_day",
  "initial_same_day",
  "NumberContactsElicited",
  "Elicit_atleast1_contact"
)
```

Missingness Tables
```{r}
# As randomized
# filter out batches with only 1 control or treated case
insufficient_batches <- cases %>%
  group_by(date_batch) %>%
  summarize(n_0 = n() - sum(Z),
            n_1 = sum(Z)) %>%
  filter(!(n_0 >= 2 & n_1 >= 2)) %>%
  pull(date_batch)

df_randomized <- cases %>%
  filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
  filter(!(date_batch %in% insufficient_batches))

# number of cases in each group
sum(df_randomized$Z)
nrow(df_randomized) - sum(df_randomized$Z)

# two-sample t-tests for each covariate
t_tests <- tibble(
  mean_control = c(),
  mean_treatment = c(),
  se = c(),
  p.value = c()
)

for (variable in outcome_variables) {
  tt <- t.test(
    as.numeric(is.na(df_randomized[[variable]][df_randomized$Z == 0])),
    as.numeric(is.na(df_randomized[[variable]][df_randomized$Z == 1])),
    var.equal = F
  )
  t_tests <- rbind(
    t_tests,
    tibble(
      mean_control = c(tt$estimate[1]),
      mean_treatment = c(tt$estimate[2]),
      se = c(tt$stderr),
      p.value = c(tt$p.value))
    )
}

t_tests <- t_tests %>%
  round(digits = 2)

# create balance table template
bal_tab <- matrix(NaN, nrow = nrow(t_tests), ncol = ncol(t_tests))
colnames(bal_tab) <- c("mean (Z=0)", "mean (Z=1)", "se", "p-val")
rownames(bal_tab) <- outcome_variables

# fill in results from t-tests
for (i in seq_along(outcome_variables)) {
  bal_tab[i, ] <- c(
    t_tests[i, ]$mean_control,
    t_tests[i, ]$mean_treatment,
    t_tests[i, ]$se,
    t_tests[i, ]$p.value
  )
}

# formatting
bal_tab
```

Balance Tables
```{r}
# balance 'as randomized'
df_bal <- cases %>%
  filter_at(all_of(balance_variables), all_vars(!is.na(.)))
insufficient_batches <- df_bal %>%
  group_by(date_batch) %>%
  summarize(n_0 = n() - sum(Z, na.rm = T),
            n_1 = sum(Z, na.rm = T)) %>%
  filter(!(n_0 >= 2 & n_1 >= 2)) %>%
  pull(date_batch)
# filter out batches with only 1 control or treated case
df_bal <- df_bal %>%
  filter(!(date_batch %in% insufficient_batches))
# number of cases in each group
sum(df_bal$Z) # n (Z=1) 1541
nrow(df_bal) - sum(df_bal$Z) # n (Z=0) 1628
# two-sample t-tests for each covariate
t_tests <- tibble(mean_control = c(), mean_treatment = c(), p.value = c())
for (covariate in balance_variables) {
  tt <- t.test(
    df_bal[[covariate]][df_bal$Z == 0],
    df_bal[[covariate]][df_bal$Z == 1],
    var.equal = F
  )
  df <- cases %>%
      filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
      group_by(date_batch) %>%
      summarize(
        n_0 = n() - sum(Z, na.rm = T),
        n_1 = sum(Z, na.rm = T),
        n_j = n(),
        cov_diff = mean(get(covariate)[Z == 1], na.rm = T) - mean(
          get(covariate)[Z == 0], na.rm = T
        ),
        s2_0 = var(get(covariate)[Z == 0], na.rm = T),
        s2_1 = var(get(covariate)[Z == 1], na.rm = T),
        se_j = s2_1 / n_1 + s2_0 / n_0
      ) %>%
      filter(n_0 >= 2 & n_1 >= 2)
    # block weights
    w <- df$n_j / sum(df$n_j)
    w2 <- (df$n_j / sum(df$n_j))^2
    # overall difference estimate
    overall_diff <- w %*% (df$cov_diff)
    # overall SE estimate
    se_hat <- sqrt(w2 %*% df$se_j)
    # print p-value
    z_hat <- abs(overall_diff / se_hat)
    t_tests <- rbind(
      t_tests,
      tibble(
        mean_control = c(tt$estimate[1]),
        mean_treatment = c(tt$estimate[2]),
        p.value = c(pnorm(-1 * z_hat, lower.tail = T) * 2)
      )
    )
}
# create balance table template
bal_tab <- matrix(NaN, nrow = length(balance_variables), ncol = 3)
colnames(bal_tab) <- c("mean (Z=0)", "mean (Z=1)", "p-val")
rownames(bal_tab) <- balance_variables
# fill in results from t-tests
for (i in seq_along(balance_variables)) {
  bal_tab[i, ] <- c(
    t_tests[i, ]$mean_control,
    t_tests[i, ]$mean_treatment,
    t_tests[i, ]$p.value
  )
}
# formatting
bal_tab %>% round(digits = 2)

itt_imbalanced_variables <- bal_tab %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column(var = "variable") %>%
  filter(`p-val` < .1) %>%
  pull(variable)
```

```{r}
# balance 'as treated' (Z=0 & D=0 against Z=1 & D=1)
df_bal <- cases %>%
  filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
  filter((D == 0 & Z == 0) | (D == 1 & Z == 1))

insufficient_batches <- df_bal %>%
  group_by(date_batch) %>%
  summarize(n_0 = n() - sum(Z, na.rm = T),
            n_1 = sum(Z, na.rm = T)) %>%
  filter(!(n_0 >= 2 & n_1 >= 2)) %>%
  pull(date_batch)

# filter out batches with only 1 control or treated case
df_bal <- df_bal %>%
  filter(!(date_batch %in% insufficient_batches))

# number of cases in each group
sum(df_bal$D)
nrow(df_bal) - sum(df_bal$D)
# two-sample t-tests for each covariate
t_tests <- tibble(mean_control = c(), mean_treatment = c(), p.value = c())
for (covariate in balance_variables) {
  tt <- t.test(
    df_bal[[covariate]][df_bal$D == 0],
    df_bal[[covariate]][df_bal$D == 1],
    var.equal = F
  )
  df <- cases %>%
      filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
      filter((D == 0 & Z == 0) | (D == 1 & Z == 1)) %>%
      group_by(date_batch) %>%
      summarize(
        n_0 = n() - sum(Z, na.rm = T),
        n_1 = sum(Z, na.rm = T),
        n_j = n(),
        cov_diff = mean(get(covariate)[Z == 1], na.rm = T) - mean(
          get(covariate)[Z == 0], na.rm = T
        ),
        s2_0 = var(get(covariate)[Z == 0], na.rm = T),
        s2_1 = var(get(covariate)[Z == 1], na.rm = T),
        se_j = s2_1 / n_1 + s2_0 / n_0
      ) %>%
      filter(n_0 >= 2 & n_1 >= 2)
    # block weights
    w <- df$n_j / sum(df$n_j)
    w2 <- (df$n_j / sum(df$n_j))^2
    # overall difference estimate
    overall_diff <- w %*% (df$cov_diff)
    # overall SE estimate
    se_hat <- sqrt(w2 %*% df$se_j)
    # print p-value
    z_hat <- abs(overall_diff / se_hat)
    t_tests <- rbind(
      t_tests,
      tibble(
        mean_control = c(tt$estimate[1]),
        mean_treatment = c(tt$estimate[2]),
        p.value = c(pnorm(-1 * z_hat, lower.tail = T) * 2)
      )
    )
}
# create balance table template
bal_tab <- matrix(NaN, nrow = length(balance_variables), ncol = 3)
colnames(bal_tab) <- c("mean (Z=0)", "mean (Z=1)", "p-val")
rownames(bal_tab) <- balance_variables
# fill in results from t-tests
for (i in seq_along(balance_variables)) {
  bal_tab[i, ] <- c(
    t_tests[i, ]$mean_control,
    t_tests[i, ]$mean_treatment,
    t_tests[i, ]$p.value
  )
}
# formatting
bal_tab %>% round(digits = 2)
```

```{r}
# balance 'as matched' with all covariates

df_bal <- cases %>%
        filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
        filter((D == 0 & Z == 0) | (D == 1 & Z == 1))

# balance variable formula for the sensitivity analysis
# D ~ perc_spanish +
#                    EP_MUNIT * EP_NOVEH +
#                    EP_AGE65 * EP_MEDICARE +
#                    EP_LIMENG * EP_NOHSDP +
#                    EP_UNINSUR * EP_NOBROADBAND +
#                    EP_POV + log(EP_PCI)

# balance variable formula for the outcomes analysis
# D ~ perc_spanish +
#                    EP_MUNIT * EP_NOVEH +
#                    EP_AGE65 * EP_MEDICARE +
#                    EP_LIMENG * EP_NOHSDP +
#                    EP_UNINSUR * EP_NOBROADBAND +
#                    EP_DIRECTINS * EP_SNAP

m_out <- matchit(D ~ perc_spanish +
                   EP_MUNIT * EP_NOVEH +
                   EP_AGE65 * EP_MEDICARE +
                   EP_LIMENG * EP_NOHSDP +
                   EP_UNINSUR * EP_NOBROADBAND +
                   EP_DIRECTINS * EP_SNAP,
                  data = df_bal, estimand = "ATT",
                 distance = "glm", link = "logit", method = "nearest",
                 replace = F, exact = ~ date_batch, discard = "control")
# balance table (p-values) for matched dataset with all covariates

# extract match matrix to compute p-values
ctr_index <- as.numeric(m_out$match.matrix) %>% na.omit()

df_trt <- df_bal %>% filter(D == 1)
trt_index <- which(!is.na(m_out$match.matrix))

# two-sample t-tests for each covariate
t_tests <- tibble(mean_control = c(), mean_treatment = c(), p.value = c())
for (covariate in balance_variables) {
  tt <- t.test(
    df_bal[[covariate]][ctr_index],
    df_trt[[covariate]][trt_index],
    var.equal = F
  )
  t_tests <- rbind(
    t_tests,
    tibble(
      mean_control = c(tt$estimate[1]),
      mean_treatment = c(tt$estimate[2]),
      p.value = c(tt$p.value)
    )
  )
}

# create balance table template
bal_tab <- matrix(NaN, nrow = length(balance_variables), ncol = 3)
colnames(bal_tab) <- c("mean (D=0)", "mean (D=1)", "p-val")
rownames(bal_tab) <- balance_variables

# fill in results from t-tests
for (i in seq_along(balance_variables)) {
    bal_tab[i, ] <- c(
      t_tests[i, ]$mean_control,
      t_tests[i, ]$mean_treatment,
      t_tests[i, ]$p.value
    )
}

# formatting
bal_tab %>% round(digits = 2)
```

Block-based Estimation

```{r}
# calculate treatment effect estimate for variables of interest
as_randomized_res <- NULL
for (variable in outcome_variables) {
  as_randomized_res <- rbind(
    as_randomized_res,
    get_treatment_effect(
      cases %>%
        filter_at(all_of(balance_variables), all_vars(!is.na(.))),
      "as randomized",
      variable
    )
  )
}
# create balance table template
res_tab <- matrix(NaN, nrow = length(outcome_variables), ncol = 5)
colnames(res_tab) <- c("mu_0", "mu_1", "effect", "se", "p-val")
rownames(res_tab) <- outcome_variables

# fill in results
for (i in seq_along(outcome_variables)) {
    res_tab[i, ] <- c(
      as_randomized_res[i, ]$mu_0,
      as_randomized_res[i, ]$mu_1,
      as_randomized_res[i, ]$ate_hat,
      as_randomized_res[i, ]$se_hat,
      as_randomized_res[i, ]$p_val
    )
}

# formatting
res_tab %>% round(digits = 2)

as_treated_res <- NULL
for (variable in outcome_variables) {
  as_treated_res <- rbind(
    as_treated_res,
    get_treatment_effect(
      cases %>%
        filter_at(all_of(balance_variables), all_vars(!is.na(.))),
      "as treated",
      variable
    )
  )
}
# create balance table template
res_tab <- matrix(NaN, nrow = length(outcome_variables), ncol = 5)
colnames(res_tab) <- c("mu_0", "mu_1", "effect", "se", "p-val")
rownames(res_tab) <- outcome_variables

# fill in results
for (i in seq_along(outcome_variables)) {
    res_tab[i, ] <- c(
      as_treated_res[i, ]$mu_0,
      as_treated_res[i, ]$mu_1,
      as_treated_res[i, ]$ate_hat,
      as_treated_res[i, ]$se_hat,
      as_treated_res[i, ]$p_val
    )
}

# formatting
res_tab %>% round(digits = 2)

as_matched_res <- NULL
for (variable in outcome_variables) {
  as_matched_res <- rbind(
    as_matched_res,
    get_treatment_effect(
      m_out,
      "as matched",
      variable
    )
  )
}
# create balance table template
res_tab <- matrix(NaN, nrow = length(outcome_variables), ncol = 5)
colnames(res_tab) <- c("mu_0", "mu_1", "effect", "se", "p-val")
rownames(res_tab) <- outcome_variables

# fill in results
for (i in seq_along(outcome_variables)) {
    res_tab[i, ] <- c(
      as_matched_res[i, ]$mu_0,
      as_matched_res[i, ]$mu_1,
      as_matched_res[i, ]$ate_hat,
      as_matched_res[i, ]$se_hat,
      as_matched_res[i, ]$p_val
    )
}

# formatting
res_tab %>% round(digits = 2)
```

CACE Analysis
```{r}
insufficient_batches <- cases %>%
  filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
  group_by(date_batch) %>%
  summarize(n_0 = n() - sum(Z),
            n_1 = sum(Z)) %>%
  filter(!(n_0 >= 2 & n_1 >= 2)) %>%
  pull(date_batch)

df_cace <- cases %>%
  filter_at(all_of(balance_variables), all_vars(!is.na(.))) %>%
  filter(!(date_batch %in% insufficient_batches))

# first stage results
# lm_robust uses same robust standard error estimator as iv_robust (HC2)
# see: https://www.rdocumentation.org/packages/estimatr/versions/0.30.2/topics/iv_robust
mod_fs <- lm_robust(
  D ~ Z,
  fixed_effects = ~ date_batch,
  clusters = date_batch,
  data = df_cace
)
summary(mod_fs)

# test for weak instrument
# basic rule of thumb (except in small samples) is F-statistic > 10 for a
# 'relevant' instrument (i.e. Z has a non-zero "effect" on D)
mod_1 <- lm(D ~ Z + factor(date_batch), data = df_cace)
mod_0 <- lm(D ~ factor(date_batch), data = df_cace)
# lm_robust uses HC2 robust standard errors (see above link)
waldtest(mod_1, mod_0, vcov = vcovHC(mod_1, type = "HC2"))
cace_res <- NULL

for (variable in outcome_variables) {
  # control for imbalanced variables
  mod <- iv_robust(
    as.formula(
      paste0(
        variable,
        " ~ D + ",
        str_c(balance_variables, collapse = " + "),
        " | Z + ",
        str_c(balance_variables, collapse = " + ")
      )
    ),
    data = df_cace,
    clusters = date_batch,
    fixed_effects = ~ date_batch
  )
  ate_hat <- mod$coefficients[1]
  se_hat <- mod$std.error[1]
  p_val <- mod$p.value[1]

  cace_res <- rbind(
    cace_res,
    tibble(effect = c(ate_hat), se = c(se_hat), p_val = c(p_val))
  )
}

# create balance table template
res_tab <- matrix(NaN, nrow = length(outcome_variables), ncol = 3)
colnames(res_tab) <- c("effect", "se", "p_val")
rownames(res_tab) <- outcome_variables

# fill in results
for (i in seq_along(outcome_variables)) {
    res_tab[i, ] <- c(
      cace_res[i, ]$effect,
      cace_res[i, ]$se,
      cace_res[i, ]$p_val
    )
}

# formatting
res_tab %>% round(digits = 2)
```

